In [ ]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
import seaborn as sns
In [ ]:
data = pd.read_csv("ai4i2020.csv")
In [ ]:
data.head()
Out[ ]:
| UDI | Product ID | Type | Air temperature [K] | Process temperature [K] | Rotational speed [rpm] | Torque [Nm] | Tool wear [min] | Machine failure | TWF | HDF | PWF | OSF | RNF | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | M14860 | M | 298.1 | 308.6 | 1551 | 42.8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 2 | L47181 | L | 298.2 | 308.7 | 1408 | 46.3 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 3 | L47182 | L | 298.1 | 308.5 | 1498 | 49.4 | 5 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 4 | L47183 | L | 298.2 | 308.6 | 1433 | 39.5 | 7 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 5 | L47184 | L | 298.2 | 308.7 | 1408 | 40.0 | 9 | 0 | 0 | 0 | 0 | 0 | 0 |
In [ ]:
data.shape
Out[ ]:
(10000, 14)
In [ ]:
from ydata_profiling import ProfileReport
profile = ProfileReport(data, title="Pandas Profiling Report")
In [ ]:
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Out[ ]:
In [ ]:
data.describe().T
Out[ ]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| UDI | 10000.0 | 5000.50000 | 2886.895680 | 1.0 | 2500.75 | 5000.5 | 7500.25 | 10000.0 |
| Air temperature [K] | 10000.0 | 300.00493 | 2.000259 | 295.3 | 298.30 | 300.1 | 301.50 | 304.5 |
| Process temperature [K] | 10000.0 | 310.00556 | 1.483734 | 305.7 | 308.80 | 310.1 | 311.10 | 313.8 |
| Rotational speed [rpm] | 10000.0 | 1538.77610 | 179.284096 | 1168.0 | 1423.00 | 1503.0 | 1612.00 | 2886.0 |
| Torque [Nm] | 10000.0 | 39.98691 | 9.968934 | 3.8 | 33.20 | 40.1 | 46.80 | 76.6 |
| Tool wear [min] | 10000.0 | 107.95100 | 63.654147 | 0.0 | 53.00 | 108.0 | 162.00 | 253.0 |
| Machine failure | 10000.0 | 0.03390 | 0.180981 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| TWF | 10000.0 | 0.00460 | 0.067671 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| HDF | 10000.0 | 0.01150 | 0.106625 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| PWF | 10000.0 | 0.00950 | 0.097009 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| OSF | 10000.0 | 0.00980 | 0.098514 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| RNF | 10000.0 | 0.00190 | 0.043550 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
In [ ]:
data.select_dtypes(include=['object']).describe().T
Out[ ]:
| count | unique | top | freq | |
|---|---|---|---|---|
| Product ID | 10000 | 10000 | M14860 | 1 |
| Type | 10000 | 3 | L | 6000 |
In [ ]:
data.columns
Out[ ]:
Index(['UDI', 'Product ID', 'Type', 'Air temperature [K]',
'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
'Tool wear [min]', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF',
'RNF'],
dtype='object')
In [ ]:
num_cols = ['Air temperature [K]', 'Process temperature [K]',
'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']
cat_cols = ['Type', 'Failure type']
label = 'Machine failure'
In [ ]:
data['Air temperature [K]'][data['Air temperature [K]'].isna().astype(int) == 1]
Out[ ]:
Series([], Name: Air temperature [K], dtype: float64)
In [ ]:
plt.figure(figsize=(12, 12))
for i, col in enumerate(num_cols):
plt.subplot(3, 2, i+1)
sns.histplot(data, x=col, kde=True, alpha=0.2, color='red', bins=15)
plt.title(col)
plt.suptitle("Data Distributions", fontsize=15)
plt.tight_layout()
plt.show()
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
In [ ]:
plt.figure(figsize=(10, 7))
for i, col in enumerate(num_cols):
plt.subplot(2, 3, i+1)
sns.rugplot(data, x=col, hue=label, height=0.1)
sns.boxplot(data, x=col, width=0.25)
plt.suptitle("Data Distributions")
plt.tight_layout()
plt.show()
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
In [ ]:
plt.figure(figsize=(10, 7))
for i, col in enumerate(num_cols):
plt.subplot(2, 3, i+1)
sns.boxplot(data, x=label, y=col, width=0.5)
plt.suptitle("Data Distribution in Relation to Machine Failure")
plt.tight_layout()
plt.show()
In [ ]:
#plt.figure(figsize = (6,6))
# , square = True, annot = True, cmap = 'Blues', linewidths = 0.5)
sns.heatmap(data[num_cols].corr(), annot=True, fmt=".2f")
plt.title("Heatmap Analysis")
plt.show()
In [ ]:
data[num_cols].corr()
Out[ ]:
| Air temperature [K] | Process temperature [K] | Rotational speed [rpm] | Torque [Nm] | Tool wear [min] | |
|---|---|---|---|---|---|
| Air temperature [K] | 1.000000 | 0.876107 | 0.022670 | -0.013778 | 0.013853 |
| Process temperature [K] | 0.876107 | 1.000000 | 0.019277 | -0.014061 | 0.013488 |
| Rotational speed [rpm] | 0.022670 | 0.019277 | 1.000000 | -0.875027 | 0.000223 |
| Torque [Nm] | -0.013778 | -0.014061 | -0.875027 | 1.000000 | -0.003093 |
| Tool wear [min] | 0.013853 | 0.013488 | 0.000223 | -0.003093 | 1.000000 |
In [ ]:
data.plot.hexbin(x='Air temperature [K]', y='Process temperature [K]',
gridsize=20, cmap='Purples', figsize=(5, 4))
plt.title("Hexbin Plot Between Process Temperature and Air Temperature")
plt.show()
In [ ]:
data.plot.hexbin(x='Rotational speed [rpm]', y='Torque [Nm]',
gridsize=30, cmap='Purples', figsize=(5, 4))
plt.title("Hexbin Plot Between Torque and Rotational speed")
plt.show()
In [ ]:
type_machine_failure = data[['Type', 'Machine failure']].pivot_table(index='Type', columns='Machine failure', aggfunc= lambda x: len(x), margins = True)
print(type_machine_failure)
plt.figure(figsize=(6,6))
sns.heatmap(type_machine_failure, annot=True, fmt='g', cmap='Blues', cbar=False, linewidths=0.5)
plt.title("Type vs Machine Failure")
plt.show()
Machine failure 0 1 All Type H 982 21 1003 L 5765 235 6000 M 2914 83 2997 All 9661 339 10000
In [ ]:
from pycaret.classification import *
s = setup(data, target = 'Machine failure', session_id = 42, data_split_stratify=True)
| Description | Value | |
|---|---|---|
| 0 | Session id | 42 |
| 1 | Target | Machine failure |
| 2 | Target type | Binary |
| 3 | Original data shape | (10000, 14) |
| 4 | Transformed data shape | (10000, 16) |
| 5 | Transformed train set shape | (7000, 16) |
| 6 | Transformed test set shape | (3000, 16) |
| 7 | Numeric features | 11 |
| 8 | Categorical features | 2 |
| 9 | Preprocess | True |
| 10 | Imputation type | simple |
| 11 | Numeric imputation | mean |
| 12 | Categorical imputation | mode |
| 13 | Maximum one-hot encoding | 25 |
| 14 | Encoding method | None |
| 15 | Fold Generator | StratifiedKFold |
| 16 | Fold Number | 10 |
| 17 | CPU Jobs | -1 |
| 18 | Use GPU | False |
| 19 | Log Experiment | False |
| 20 | Experiment Name | clf-default-name |
| 21 | USI | 4ba6 |
In [ ]:
best_model = compare_models(sort = 'AUC')
| Initiated | . . . . . . . . . . . . . . . . . . | 09:37:04 |
|---|---|---|
| Status | . . . . . . . . . . . . . . . . . . | Loading Dependencies |
| Estimator | . . . . . . . . . . . . . . . . . . | Compiling Library |
In [ ]:
plt.figure(figsize = (4,3))
plot_model(best_model, plot = 'confusion_matrix')
In [ ]:
plt.figure(figsize = (5,4))
plot_model(best_model, plot = 'auc')
In [ ]:
plt.figure(figsize = (5, 4))
plot_model(best_model, plot = 'learning')
In [ ]:
plot_model(best_model, plot = 'feature')
In [ ]:
save_model(best_model, "ai4i2020_pycaret_model")
Transformation Pipeline and Model Successfully Saved
Out[ ]:
(Pipeline(memory=Memory(location=None),
steps=[('numerical_imputer',
TransformerWrapper(exclude=None,
include=['UDI', 'Air temperature [K]',
'Process temperature [K]',
'Rotational speed [rpm]',
'Torque [Nm]', 'Tool wear [min]',
'TWF', 'HDF', 'PWF', 'OSF',
'RNF'],
transformer=SimpleImputer(add_indicator=False,
copy=True,
fill_value=None,
keep_empty_features=False,
missing_valu...
TransformerWrapper(exclude=None, include=None,
transformer=CleanColumnNames(match='[\\]\\[\\,\\{\\}\\"\\:]+'))),
('trained_model',
LogisticRegression(C=1.0, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1,
l1_ratio=None, max_iter=1000,
multi_class='auto', n_jobs=None,
penalty='l2', random_state=42,
solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False))],
verbose=False),
'ai4i2020_pycaret_model.pkl')
In [ ]:
plot_model(best_model, plot = 'calibration')
In [ ]:
calibrated_model = calibrate_model(best_model)
| Initiated | . . . . . . . . . . . . . . . . . . | 09:38:35 |
|---|---|---|
| Status | . . . . . . . . . . . . . . . . . . | Loading Dependencies |
| Estimator | . . . . . . . . . . . . . . . . . . | Compiling Library |
| Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
|---|---|---|---|---|---|---|---|
| Fold | |||||||
| 0 | 0.9971 | 0.9670 | 0.9130 | 1.0000 | 0.9545 | 0.9531 | 0.9541 |
| 1 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 |
| 2 | 0.9971 | 0.9558 | 0.9130 | 1.0000 | 0.9545 | 0.9531 | 0.9541 |
| 3 | 0.9986 | 1.0000 | 0.9583 | 1.0000 | 0.9787 | 0.9780 | 0.9782 |
| 4 | 0.9971 | 0.9581 | 0.9167 | 1.0000 | 0.9565 | 0.9550 | 0.9560 |
| 5 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 |
| 6 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 |
| 7 | 0.9986 | 1.0000 | 0.9583 | 1.0000 | 0.9787 | 0.9780 | 0.9782 |
| 8 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 |
| 9 | 0.9986 | 0.9767 | 0.9583 | 1.0000 | 0.9787 | 0.9780 | 0.9782 |
| Mean | 0.9987 | 0.9858 | 0.9618 | 1.0000 | 0.9802 | 0.9795 | 0.9799 |
| Std | 0.0012 | 0.0182 | 0.0356 | 0.0000 | 0.0186 | 0.0192 | 0.0188 |
In [ ]:
plot_model(calibrated_model, plot = 'calibration')
In [ ]:
automl()
Out[ ]:
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
max_iter=None, positive=False, random_state=42, solver='auto',
tol=0.0001)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
max_iter=None, positive=False, random_state=42, solver='auto',
tol=0.0001)In [ ]:
#create_app(best_model)